* PATSTAT AMADEUS MATCH
* Note: based on Patents-Application number in Amadeus and appln_nr_epodoc in Patstat
* Note: only firms with patents are in Amadeus data
* Note: use both amadeus patents from 2015 and 2016 to maximize nr. of firms and patents
* 8 May 2016 Federica

cd "C:\Users\federc\Dropbox\PATSTAT\DATA"
global tmp "D:\TEMP"
global dta "D:\Patstat data"

* ----------------------------------------------
* Preliminaries
* Import BvD and patents from Amadeus
* ----------------------------------------------
* Prepare data for matching: get BvD and patents from Amadeus
* cd "C:\Users\federc\Dropbox\PATSTAT\DATA"
use "D:\patents1a_amadeus.dta", clear
append using "D:\patents1b_amadeus.dta"
drop patentsprioritynumber
saveold amadeus_patents_2015, replace

import delimited C:\Users\federc\Downloads\amadeus_patents.txt, varnames(1) clear
drop ÿþmark
replace bvdidnumber = bvdidnumber[_n-1] if bvdidnumber ==""
saveold amadeus_patents_2016, replace
joinby patentsapplicationnumber using "C:\Users\federc\Dropbox\PATSTAT\DATA\amadeus_patents_2015", unmatched(both)
saveold amadeus_patents_15_16, replace

* ----------------------------------------------
* Match Amadeus and Patstat
* Note: match is based on patentsapplicationnumber in Amadeus and appln_nr_epodoc in Patstat
* ----------------------------------------------
use amadeus_patents_15_16, clear
drop if missing(patentsapplicationnumber) //611,184 obs deleted because key variable for matching is missing
* Construct appln_nr_epodoc in Amadeus to match with tls201 in Patstat
gen x=length(patentsapplicationnumber) - 8
* Get appln_nr_epodoc to match with 
gen appln_nr_epodoc = substr( patentsapplicationnumber ,1 ,x)
replace appln_nr_epodoc = trim(appln_nr_epodoc)
keep companyname bvdidnumber appln_nr_epodoc patentspublicationidentifier
joinby appln_nr_epodoc using "$dta/tls201/tls201", unmatched(master)

tab _merge
/*
                       _merge |      Freq.     Percent        Cum.
------------------------------+-----------------------------------
          only in master data |     43,092        2.94        2.94
both in master and using data |  1,424,194       97.06      100.00
------------------------------+-----------------------------------
                        Total |  1,467,286      100.00
*/
keep companyname bvdidnumber appln_nr_epodoc appln_id appln_auth appln_filing_date patentspublicationidentifier	
replace companyname = companyname[_n-1] if companyname ==""					
saveold amadeus_patents_2015_16_list, replace


* ----------------------------------------------
* Match Amadeus patents list with Efige data
* Note: match is based on BvD ID number
* ----------------------------------------------
use "C:\Users\federc\Dropbox\efige\full_set_amadeus", clear
drop _merge
rename bvdep_id_number bvdidnumber
merge 1:m bvdidnumber using amadeus_patents_2015_16_list, keep(match master)
/*
  Result                           # of obs.
    -----------------------------------------
    not matched                        12,095
        from master                    12,095  (_merge==1)
        from using                          0  (_merge==2)

    matched                            29,894  (_merge==3)
    -----------------------------------------
*/
count if _merge==3 & appln_id !=. // number of matched obs with patents: 29067
saveold amadeus_efige_list, replace


* ----------------------------------------------
* Check company names inconsistencier between different versions of Amadeus
* Note: for 775 observation, same BvD ID number is associated to different company names
* ----------------------------------------------
use amadeus_efige_list, clear
gen namecheck=1 if company_name!=companyname & company_name!="" & companyname!=""
keep company_name companyname namecheck bvdidnumber
order company_name companyname namecheck bvdidnumber
keep if namecheck==1
collapse namecheck, by( bvdidnumber company_name companyname)
*export excel using $tmp\namecheck.xls, firstrow(variables) replace
merge 1:1 company_name companyname bvdidnumber namecheck  using "C:\Users\federc\Dropbox\Amadeus_patent\companyname_mismatch.dta"
replace companyname_mismatch=0 if companyname_mismatch==.
tab companyname_mismatch
/*
companyname |
  _mismatch |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |        649       86.30       86.30
          1 |        103       13.70      100.00
------------+-----------------------------------
      Total |        752      100.00
*/

* Remove companies with name mismatch (103 out of 14759 -> 14656 left)
drop _merge namecheck
save $tmp\namecheck, replace
use amadeus_efige_list, clear
drop _merge
merge m:1 bvdidnumber using $tmp\namecheck  
drop if companyname_mismatch==1
drop _merge companyname_mismatch
saveold amadeus_efige_list, replace


* ----------------------------------------------
* Create Hrm <-> bvd concordance
* ----------------------------------------------
use amadeus_efige_list, clear
keep bvdidnumber appln_id
* Get person_id from tls207
joinby appln_id using "$dta\tls207\tls207"
* Keep applicants only
keep if applt_seq_nr>0
keep bvdidnumber person_id
*duplicates drop
* Get hrm_l2_id from tls906
merge m:1 person_id using "$dta\tls906\tls906_all", keep(match)
egen main_hrm=mode( hrm_l2_id), by( bvdidnumber ) minmode

by bvdidnumber  hrm_l2_id, sort: gen unq=_n==1
drop if unq==0
collapse (count) nr_hrm=hrm_l2_id, by(bvd main_hrm)
rename main_hrm hrm_l2_id
label var hrm "MAIN (mode) hrm_l2_id"
saveold bvd_hrm_correspondence, replace





